#' ---
#' title: Merge PPP Datasets (Kaufman & Klevs 2022)
#' author: Joe Ornstein
#' date: 2025-06-20
#' version: 0.21
#' ---

library(tidyverse)
library(fuzzylink)

## Load and clean data from PPP (Kaufman & Klevs 2022) -----------------

# original dataverse link: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/4031UL
# Dataset A: PPP records. `id` is a row index added before any rows are
# dropped, so it refers to row positions in the raw file.
ppp <- read_csv('raw/150k_plus_cities.csv') |>
  # seq_len() is safe even if the file has zero rows (1:n() would give c(1, 0))
  mutate(id = seq_len(n()))

# Dataset B is a list of all US cities
cities <- read_csv('raw/uscities.csv') |>
  mutate(city = str_to_title(city)) |>
  # fuzzy matching variable is 'city, state_id'
  mutate(name = paste0(city, ', ', state_id))

# Reduce the PPP data to the distinct 'city, state_id' names that do NOT
# already appear verbatim in `cities`; only these need fuzzy matching.
ppp <- ppp |>
  # normalise capitalisation the same way as in `cities`
  mutate(city = str_to_title(City)) |>
  select(id, city, state_id = State) |>
  # fuzzy matching variable is 'city, state_id'
  mutate(name = paste0(city, ', ', state_id)) |>
  # keep unique city names (first occurrence, with its id)
  distinct(name, .keep_all = TRUE) |>
  # remove exact matches (%in% already ignores duplicates in the lookup set)
  filter(!(name %in% cities$name))

## fuzzylink() ----------------

# Each of the two merges below takes approximately 17 minutes to run.
# NOTE: the result must be called `df` in both cases, because save() stores
# the object under that name and later code load()s it back as `df`.
# The `instructions` text is part of the prompt and is kept verbatim.

# Merge 1: fuzzylink() with its default behaviour
df <- fuzzylink(
  ppp, cities,
  by = "name",
  blocking.variables = "state_id",
  record_type = "US city or CDP",
  instructions = "Some names may contain typos or mispellings."
)
save(df, file = "data/cities-merge/cities_fuzzylink.RData")

# Merge 2: same call, but returning *all* within-block record pairs,
# which is what the calibration analysis needs
df <- fuzzylink(
  ppp, cities,
  by = "name",
  blocking.variables = "state_id",
  record_type = "US city or CDP",
  instructions = "Some names may contain typos or mispellings.",
  return_all_pairs = TRUE
)
save(df, file = "data/cities-merge/cities_fuzzylink_all-pairs.RData")

### AFSM (No Embeddings) ----------------------------

# Now we'll compare this with two variants of AFSM:
# 1. The exact AFSM algorithm from Kaufman & Klevs (2022); one n=500 HITL step
# 2. Adding embedding similarity as a predictor, otherwise exactly the same

# devtools::install_github('aaronrkaufman/stringmatch')
library(stringmatch)

# base_training is the ~70,000 labeled string pairs from the amicus-bonica train set
base_training <- read_csv('raw/full_train_set.csv')

# Base Model: probability forest on the lexical metrics in columns
# cosine..overlap (an earlier variant used select(osa:soundex)).
# `seed` is fixed so that the Basic Score, and therefore the top-500 sample
# sent to the HITL step, is reproducible; without it this was the only
# unseeded fit in the script.
m <- ranger::ranger(x = base_training |> select(cosine:overlap),
                    y = factor(base_training$label),
                    probability = TRUE,
                    seed = 42)

# Reload the all-pairs fuzzylink output (restores the object `df`) and build
# the strings AFSM is scored on: strip the trailing ', ST' state suffix and
# convert A and B to all-caps.
load('data/cities-merge//cities_fuzzylink_all-pairs.RData')
df <- df |>
  mutate(A_caps = str_to_upper(str_remove(A, ",\\s*[A-Z]{2}$")),
         B_caps = str_to_upper(str_remove(B, ",\\s*[A-Z]{2}$")))

# Overlap string similarity metric (not available in stringdist).
#
# Treats each string as the set of its distinct characters and returns
#   |chars(str1) & chars(str2)| / min(|chars(str1)|, |chars(str2)|)
#
# Arguments:
#   str1, str2  single character strings (call through mapply()/vapply() to
#               score vectors pairwise; a longer vector would be pooled into
#               one character set)
# Returns:
#   a single number in [0, 1]; 0 when either string has no characters, so
#   the result is never NaN from a 0/0 division.
get_overlap <- function(str1, str2) {
  # distinct characters of each string
  chars1 <- unique(unlist(strsplit(str1, split = "")))
  chars2 <- unique(unlist(strsplit(str2, split = "")))

  # denominator is the size of the smaller of the two sets
  smaller_set <- min(length(chars1), length(chars2))

  # an empty string shares nothing with anything; avoid 0 / 0 = NaN
  if (smaller_set == 0L) {
    return(0)
  }

  length(intersect(chars1, chars2)) / smaller_set
}

# get suite of lexical string distance metrics identified in Appendix A
# NOTE(review): cosine and jaccard are similarities (stringsim), whereas
# levenshtein and lcsstr are raw distances (stringdist, methods 'lv'/'lcs').
# This assumes the same scales were used to build base_training - confirm.
df$cosine <- stringdist::stringsim(df$A_caps, df$B_caps, method = "cosine")
df$jaccard <- stringdist::stringsim(df$A_caps, df$B_caps, method = "jaccard")
df$levenshtein <- stringdist::stringdist(df$A_caps, df$B_caps, method = 'lv')
df$lcsstr <- stringdist::stringdist(df$A_caps, df$B_caps, method = "lcs")
# USE.NAMES = FALSE: otherwise mapply() names every element after its str1
# value and a named vector (one copy of each string) is stored in the column
df$overlap <- mapply(get_overlap, str1 = df$A_caps, str2 = df$B_caps,
                     USE.NAMES = FALSE)

# "Basic Score" is the predicted match probability from the Base Model
# (column 2 of the prediction matrix, i.e. the second level of factor(label))
system.time(
  df$basic_score <- predict(m, df)$predictions[,2]
)
# roughly 3.5 minutes to compute predictions

# extract the top 500 for a HITL loop
# slice_max() with with_ties = FALSE is deterministic; the seed set here is
# what makes the ranger refit below reproducible, so it must stay before it.
set.seed(42)
to_label <- df |>
  slice_max(basic_score, n = 500, with_ties = FALSE)

write_csv(to_label, file = 'data/cities-merge/hitl_pairs_no_embeddings.csv')

# add labels to the train set
# (the *_labeled.csv file is the hand-labeled version of the file written above)
labeled <- read_csv('data/cities-merge/hitl_pairs_no_embeddings_labeled.csv')

train <- bind_rows(base_training, labeled)

# refit model on base training data + HITL labels
m <- ranger::ranger(x = train |> select(cosine:overlap),#select(osa:soundex),
                    y = factor(train$label),
                    probability = TRUE)

# "HITL Score" is the predicted match probability from the refined model
system.time(
  df$hitl_score <- predict(m, df)$predictions[,2]
)
# about 5.5 minutes

# add labels from HITL loop
# join keys are spelled out so the join cannot silently pick up extra
# shared columns (same keys as used for previously-labeled pairs later on)
df <- df |>
  left_join(labeled |>
              select(A_caps, B_caps, state_id, label),
            by = c('A_caps', 'B_caps', 'state_id'))

save(df, file = 'data/cities-merge/cities_afsm_no_embeddings.RData')

### AFSM (With Embeddings) ----------------------------

# get embeddings for every distinct string in the base_training set
emb <- get_embeddings(unique(c(base_training$amicus, base_training$bonica)))

# add embedding similarity to base_training: row-wise dot product of the two
# strings' embedding vectors (rows of `emb` are looked up by string)
# NOTE(review): equals cosine similarity only if embeddings are unit-length,
# and is assumed comparable to the `sim` column already in df - confirm.
base_training$sim <- rowSums(emb[base_training$amicus,] * emb[base_training$bonica,])

# fit random forest on the lexical metrics plus embedding similarity.
# Predictors are named explicitly: `cosine:sim` would also sweep in any
# column that happens to sit between `overlap` and the newly appended `sim`.
m <- ranger::ranger(x = base_training |> select(cosine:overlap, sim),
                    y = factor(base_training$label),
                    probability = TRUE,
                    importance = 'impurity')
ranger::importance(m)

# df may still carry `label` from the no-embeddings HITL step. Drop it so
# that (a) the join with previously-labeled pairs below yields a single
# `label` column rather than label.x / label.y, and (b) the final join
# attaches this section's labels. No-op if the column is absent.
df$label <- NULL

# "Basic Score" is the predicted match probability from the Base Model
system.time(
  df$basic_score <- predict(m, df)$predictions[,2]
)
# roughly 4.5 minutes

# extract the top 500 for a HITL loop
# (the seed matters for the ranger refit further down, not for slice_max)
set.seed(42)
to_label <- df |>
  slice_max(basic_score, n = 500, with_ties = FALSE)

# see if any of the previously-labeled pairs can help us
previously_labeled <- read_csv('data/cities-merge/hitl_pairs_no_embeddings_labeled.csv') |>
  select(A_caps, B_caps, state_id, label)

# pre-fill labels for pairs that were already hand-labeled in the first loop
to_label <- left_join(to_label, previously_labeled, by = c('A_caps', 'B_caps', 'state_id'))

write_csv(to_label, file = 'data/cities-merge/hitl_pairs_with_embeddings.csv')

# add labels to the train set
# (the *_labeled.csv file is the hand-completed version of the file above)
labeled <- read_csv('data/cities-merge/hitl_pairs_with_embeddings_labeled.csv')
train <- bind_rows(base_training, labeled)

# refit model
m <- ranger::ranger(x = train |> select(cosine:overlap, sim),
                    y = factor(train$label),
                    probability = TRUE,
                    importance = 'impurity')
ranger::importance(m)

# "HITL Score" is the predicted match probability from the refined model
system.time(
  df$hitl_score <- predict(m, df)$predictions[,2]
)
# about 4.8 minutes

# add labels from HITL loop
# explicit keys: a natural join would also key on any other shared column
df <- df |>
  left_join(labeled |> select(A, B, label), by = c('A', 'B'))

save(df, file = 'data/cities-merge/cities_afsm_with_embeddings.RData')